Environment Prep

if (!require('xlsx')) install.packages('xlsx')
if (!require('fitdistrplus')) install.packages('fitdistrplus')

Helper Function - Fitting/Plotting

We’ll use this a few times to test fit using the fitdistrplus packge, so let’s make a function.

GoodnessPlot <- function(vector, try_list, discrete=FALSE) {
    # given a data distribution uses the fitdistrplus package to 
    # plot goodness of fits agains a list of distributions to try
    #
    # Args: 
    #   vector:  desired distribution
    #   try_list:  list of distributions to test
    #
    # Returns:
    #   a series of plots showing:
    #     Density - closeness of fit for each distribution
    #     QQ - how do the tails match
    #     CDF - how do the cdfs match
    #     PP - how well does the center match
    #   a series of goodness of fit tests
    #      Cramer-von Mises, Kolmogorov-Smirnov and Anderson-Darling

    plot.legend <- try_list
    n <- length(try_list)
    fit_list <- list()
    
    for (i in 1:length(try_list)) {
        name <- paste0("fit", i)
        if (discrete) {
            fit <- fitdist(vector, try_list[i], discrete=T) 
            } else {
            fit <- fitdist(vector, try_list[i])
            }
        fit_list[[name]] <- fit
    }
    
    denscomp(fit_list, legendtext = plot.legend)
    qqcomp(fit_list, legendtext = plot.legend)
    cdfcomp(fit_list, legendtext = plot.legend)
    ppcomp(fit_list, legendtext = plot.legend)
    
    gofstat(fit_list)
}

Kelton 6.5.1

df1 <- read.xlsx('Problem_Dataset_06_01.xls', 1, header=F)
plotdist(df1$X1, histo = TRUE, demp = TRUE)

descdist(df1$X1, boot=500)

## summary statistics
## ------
## min:  2.06   max:  48.65 
## median:  9.015 
## mean:  11.92048 
## estimated sd:  9.832445 
## estimated skewness:  1.790431 
## estimated kurtosis:  6.895655
try <- c("gamma", "weibull", "lnorm", "logis")
GoodnessPlot(df1$X1, try)

## Goodness-of-fit statistics
##                              1-mle-gamma 2-mle-weibull 3-mle-lnorm
## Kolmogorov-Smirnov statistic  0.07461236    0.07934941  0.05044292
## Cramer-von Mises statistic    0.05174037    0.05847634  0.01825121
## Anderson-Darling statistic    0.35230811    0.44537170  0.14142579
##                              4-mle-logis
## Kolmogorov-Smirnov statistic   0.1557695
## Cramer-von Mises statistic     0.1562512
## Anderson-Darling statistic     1.3210442
## 
## Goodness-of-fit criteria
##                                1-mle-gamma 2-mle-weibull 3-mle-lnorm
## Akaike's Information Criterion    288.2076      290.3611    284.9146
## Bayesian Information Criterion    291.6830      293.8364    288.3900
##                                4-mle-logis
## Akaike's Information Criterion    308.5568
## Bayesian Information Criterion    312.0321

It looks like the best option here is the Lognormal Distribution.

fln <- fitdist(df1$X1, "lnorm")
coef <- round(coef(fln), 4)
RV <- rlnorm(10^5, coef[1], coef[2])

hist(df1$X1, prob=TRUE, col="grey", ylim=c(0, .09), main="Density of Data vs Fit of Chosen Distribution")
lines(density(RV), col="blue", lwd=2) 

print(paste0("Simio Command:  Random.Lognormal(", coef[1], ", ", coef[2], ")"))
## [1] "Simio Command:  Random.Lognormal(2.1851, 0.7712)"

Kelton 6.5.2

df2 <- read.xlsx('Problem_Dataset_06_02.xls', 1, header=F)
plotdist(df2$X1, histo = TRUE, demp = TRUE)

descdist(df2$X1, boot=500)

## summary statistics
## ------
## min:  3.8   max:  36.13 
## median:  9.14 
## mean:  10.89234 
## estimated sd:  6.364957 
## estimated skewness:  1.97635 
## estimated kurtosis:  7.933002
try <- c("gamma", "weibull", "lnorm", "logis")
GoodnessPlot(df2$X1, try)

## Goodness-of-fit statistics
##                              1-mle-gamma 2-mle-weibull 3-mle-lnorm
## Kolmogorov-Smirnov statistic  0.09941791     0.1140618  0.06764782
## Cramer-von Mises statistic    0.11444195     0.2008969  0.03973639
## Anderson-Darling statistic    0.71667740     1.3230284  0.26739444
##                              4-mle-logis
## Kolmogorov-Smirnov statistic   0.1251506
## Cramer-von Mises statistic     0.1480473
## Anderson-Darling statistic     1.3305012
## 
## Goodness-of-fit criteria
##                                1-mle-gamma 2-mle-weibull 3-mle-lnorm
## Akaike's Information Criterion    288.3562      296.2684    282.6829
## Bayesian Information Criterion    292.0565      299.9687    286.3832
##                                4-mle-logis
## Akaike's Information Criterion    302.0135
## Bayesian Information Criterion    305.7138

It looks like the best option here is again the Lognormal Distribution.

fln <- fitdist(df2$X1, "lnorm")
coef <- round(coef(fln), 4)
RV <- rlnorm(10^5, coef[1], coef[2])

hist(df2$X1, prob=TRUE, col="grey", ylim=c(0, .11), main="Density of Data vs Fit of Chosen Distribution")
lines(density(RV), col="blue", lwd=2) 

print(paste0("Simio Command:  Random.Lognormal(", coef[1], ", ", coef[2], ")"))
## [1] "Simio Command:  Random.Lognormal(2.2579, 0.4906)"

Kelton 6.5.3

df3 <- read.xlsx('Problem_Dataset_06_03.xls', 1, header=F)
plotdist(df3$X1, histo = TRUE, demp = TRUE)

descdist(df3$X1, boot=500)

## summary statistics
## ------
## min:  1   max:  8 
## median:  3 
## mean:  2.955556 
## estimated sd:  1.536755 
## estimated skewness:  1.02143 
## estimated kurtosis:  4.428947
try <- c("gamma", "weibull", "lnorm")
GoodnessPlot(df3$X1, try, discrete=TRUE)

## Chi-squared statistic:  11.36869 6.950712 17.60503 
## Degree of freedom of the Chi-squared distribution:  2 2 2 
## Chi-squared p-value:  0.003398758 0.03095081 0.0001503542 
##    the p-value may be wrong with some theoretical counts < 5  
## Chi-squared table:
##      obscounts theo 1-mle-gamma theo 2-mle-weibull theo 3-mle-lnorm
## <= 1         7         2.362048           3.540233         1.731231
## <= 2        13        10.830084           9.558281        12.535323
## <= 3        11        12.872701          11.605719        13.198144
## <= 4         8         9.295658           9.679546         8.217889
## > 4          6         9.639510          10.616221         9.317412
## 
## Goodness-of-fit criteria
##                                1-mle-gamma 2-mle-weibull 3-mle-lnorm
## Akaike's Information Criterion    160.2218      162.3428    160.9673
## Bayesian Information Criterion    163.8352      165.9561    164.5806

Here let’s try the Gamma Distribution.

fln <- fitdist(df3$X1, "gamma", discrete=TRUE)
coef <- round(coef(fln), 4)
RV <- rgamma(10^5, coef[1], coef[2])

hist(df3$X1, prob=TRUE, col="grey", ylim=c(0, .44), main="Density of Data vs Fit of Chosen Distribution")
lines(density(RV), col="blue", lwd=2) 

print(paste0("Simio Command:  Random.Gamma(", coef[1], ", ", coef[2], ")"))
## [1] "Simio Command:  Random.Gamma(3.8536, 1.3038)"

References